# Setup ----
# NOTE(review): rm(list = ls()) wipes the caller's global environment when this
# script is source()d from an interactive session. Kept to preserve the
# existing behaviour; consider running the script in a fresh R process instead.
rm(list = ls())

# Use library() rather than require(): library() stops immediately when a
# package is missing, whereas require() only warns and returns FALSE, which
# would let the script continue and fail later with an unrelated error.
library(quanteda)
library(stringi)
library(furrr)
library(purrr)

# Project helpers. The script later reads an object named `dict`, which is
# presumably defined in this file -- TODO confirm.
source("functions.R")

# Parallelism settings: threads used by quanteda, and the number of background
# R sessions used by furrr/future.
# NOTE(review): both values are hard-coded; confirm they suit the host machine.
quanteda_options(threads = 60)
plan(multisession, workers = 32)

# Load tokens ----
# Collect every serialized tokens object named tokens_*.RDS in the data folder.
token_dir <- "data/"
token_files <- list.files(
  token_dir,
  pattern = "^tokens_.*\\.RDS$",
  full.names = TRUE
)

# Fail early with a clear message: with no input files, do.call(c, list())
# below would return NULL and the script would only break further down with a
# much less helpful error.
if (length(token_files) == 0L) {
  stop("No tokens_*.RDS files found in '", token_dir, "'", call. = FALSE)
}

# Read each file in a parallel worker and drop tokens matching the regex "\\d"
# (i.e. containing a digit). min_nchar = 2 additionally applies quanteda's
# minimum token length filter -- presumably dropping one-character tokens;
# verify against the tokens_select() documentation.
toks_list <- future_map(token_files, function(f) {
  toks <- readRDS(f)
  tokens_remove(toks, "\\d", valuetype = "regex", min_nchar = 2)
})

# Concatenate the per-file tokens objects into a single tokens object.
toks <- do.call(c, toks_list)

# Compound multi-word dictionary entries ----
# `dict` is not defined in this script; presumably it comes from functions.R
# -- TODO confirm.
dict_issue <- dict[["issue"]]

# Flatten the dictionary into one character vector and keep only the entries
# that contain a space, i.e. the multi-word terms.
issue_values <- unlist(dict_issue)
multi_terms <- issue_values[grepl(" ", issue_values)]

# Join each multi-word term into a single token, using a space as the
# concatenator. Skipped entirely when the dictionary has no multi-word entries.
if (length(multi_terms) > 0) {
  toks <- tokens_compound(
    toks,
    pattern = phrase(multi_terms),
    concatenator = " "
  )
}


# Dictionary lookup and export ----
# Replace tokens by the dictionary keys they match, then count keys per
# document in a document-feature matrix.
toks_issue <- tokens_lookup(toks, dict_issue)
dfmt <- dfm(toks_issue)

# Called without `groups`, dfm_group() uses its default grouping -- presumably
# the document ids, merging rows that share an id; verify against the
# dfm_group() documentation.
dfmt <- dfm_group(dfmt)

# Reduce counts to presence/absence and convert to a plain data frame.
dfmt_bool <- dfm_weight(dfmt, scheme = "boolean")
dat_keys <- convert(dfmt_bool, to = "data.frame")

# Prepend an explicit `docid` column holding the document names.
# NOTE(review): convert() presumably adds its own document-name column as well
# (`doc_id` by default), so the ids may appear twice; kept as-is because
# downstream code may rely on the current columns.
dat <- cbind(data.frame(docid = docnames(dfmt)), dat_keys)

saveRDS(dat, "data/data_dictionary.RDS")
